#delimit;
** Session setup: project root, a named plain-text log, and interactive settings;
local fileloc "~/KMS_REPLICATION";

** Close any log still open under this name before reopening it, ignoring the error if none is open;
capture log close templog;
set logtype text;
log using `fileloc'/log_files/importing_daily_birth_data.txt, name(templog) replace;

set more off;
clear all;
pause on;

**XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;
**XXXXXXXXXXXXXXXXXXXX LOCALS FOR WHOLE PROGRAM XXXXXXXXXXXXXXXXXXXXXXXXX;
**XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;

** Build one cleaned birth-level file per year from the raw records read through that year's dictionary;
** Records with unknown or missing covariates are dropped as each variable is formed;
foreach year in 2002 2003 2004 2005 2006 {;

	infile using `fileloc'/dicts/birth_dict_`year'.dct, clear;

	************************ FORMING OBSERVABLES *************************;

	** GENDER: male is 1 when child_sex is 1, 0 when it is 2, and missing otherwise;
	gen male = (child_sex == 1) if inlist(child_sex, 1, 2);
	drop child_sex;
	tab male, missing;

	** MOTHER RACE: codes 98 and 99 are treated as unknown and dropped together with missing values;
	drop if inlist(mother_race, 98, 99) | mother_race == .;
	gen white = (mother_race == 10);
	gen black = (mother_race == 20);
	gen asian = (mother_race >= 40 & mother_race <= 49);
	gen other_race = (mother_race >= 51);
	** Many qualifiers under hispanic because of how the data are recorded. This prevents any one individual being counted as two different races;
	gen hisp = (mother_hisp ~= 1 & black ~= 1 & asian ~= 1 & other_race ~= 1);

	** FATHER RACE: same unknown codes as for the mother;
	drop if inlist(father_race, 98, 99) | father_race == .;

	** PARENTS DIFFERENT RACES;
	** Both race codes are non-missing at this point, so the indicator is always 0 or 1 and needs no missing-value drop;
	gen dif_race_parents = (mother_race ~= father_race);

	** MOTHER EDUCATION: non-numeric entries become missing through the force option, 99 is the unknown code;
	destring mother_educ, replace force;
	drop if mother_educ == 99 | mother_educ == .;

	** Method changes in 2006: different cutoffs are used and 9 is treated as unknown in that year only;
	if `year' == 2006 {;
		drop if mother_educ == 9;
		gen HS_dropout = (mother_educ < 3);
		gen HS_grad = (mother_educ >= 3);
		gen college_grad = (mother_educ >= 6);
	};
	else {;
		gen HS_dropout = (mother_educ < 12);
		gen HS_grad = (mother_educ >= 12);
		gen college_grad = (mother_educ >= 16) /* Note that this assumes that if they have 4 years of college education or more, they have graduated */;
	};

	** MOTHER AGE: 99 is the unknown code;
	drop if mother_age == 99 | mother_age == .;

	** MULTIPLE BIRTHS: 9 is the unknown code;
	** A missing multi_birth is dropped as well, because Stata orders missing above every number and it would otherwise satisfy the >= 3 test and be counted as triplets or more;
	drop if multi_birth == 9 | multi_birth == .;
	gen twins = (multi_birth == 2);
	gen trip_or_more = (multi_birth >= 3);

	** GOVERNMENT INSURANCE: payment 99 is the unknown code;
	gen medicaid = inlist(payment, 2, 13) /* Medi-Cal with and without California Perinatal Services Program */;
	drop if payment == 99;

	** PRENATAL CARE;
	destring prenatal_start, replace force;
	drop if prenatal_start == .;

	** FETAL DEATH;
	** NOTE: the branch for years before 1999 compares birth_type with the string X, while from 1999 onward fetal death is birth_type code 2. Only the second branch runs for the years in this loop;
	if `year' < 1999 {;
		gen fetal_death = (birth_type == "X");
	};
	else {;
		gen fetal_death = (birth_type == 2);
	};

	** GESTATION LENGTH;
	destring gest_length, replace force;

	** DROP USELESS STRING VARIABLES;
	drop birth_type;
	capture drop death_cause;

	** Changing coded missing to . before the indicators below are built;
	replace birth_wght = . if inlist(birth_wght, 9998, 9999);
	replace mother_zip = . if mother_zip == 99999;
	replace prenatal_visits = . if prenatal_visits == 99;
	replace gest_length = . if gest_length == 999;
	replace parity = . if parity == 99;

	** Creating variables;
	gen care_first_tri = (prenatal_start <= 3 & prenatal_start ~= 0);
	gen low_weight = (birth_wght < 2500) /* general def of low birth weight */;
	gen teen_mom = (mother_age < 19);
	gen age19_25 = (mother_age > 18 & mother_age <= 25);
	gen age26_30 = (mother_age >= 26 & mother_age <= 30);
	gen age31_35 = (mother_age >= 31 & mother_age <= 35);
	gen age36up = (mother_age >= 36);
	gen first_born = (parity == 1);
	gen second_born = (parity == 2);
	gen third_born = (parity == 3);
	** Parity 99 was recoded to missing above, so only missing and 98 still need to be excluded here;
	gen fourth_or_more = (parity >= 4 & parity ~= . & parity ~= 98);

	gen premature = (gest_length <= 245) /* 245 days is 35 weeks. NOTE(review): the earlier comment described this as more than 3 weeks early, which for a 40-week term would be 259 days - confirm the intended cutoff */;
	drop if gest_length < (26*7) /* only include those who have three trimesters */;
	drop if gest_length > (42*7) /* drop gestation lengths over 42 weeks (294 days), some obs have lengths that are not possible. Missing lengths are dropped here too because missing compares as larger than any number */;

	** GENERATE VARIABLES OF INTEREST;

	** GENERATE SINGLE VARIABLE FOR DAY OF BIRTH AND DAY OF DEATH;
	** Two-digit years 11 to 99 are read as 19xx and 0 to 10 as 20xx. Values outside 0 to 99 are left untouched so an already four-digit year is not shifted;
	foreach act in birth death {;
		replace `act'_year = `act'_year + 1900 if inrange(`act'_year, 11, 99);
		replace `act'_year = `act'_year + 2000 if inrange(`act'_year, 0, 10);
		gen `act'_date = mdy(`act'_month, `act'_day, `act'_year);
		format `act'_date %d;
	};

	** DROP BIRTH/DEATH TIMES THAT MAKE NO SENSE;
	drop if birth_date == .;
	drop if death_date < birth_date /* died before born? */;
	** Age at death in days, left missing when there is no death date;
	gen age_at_death = death_date - birth_date if death_date ~= .;
	drop if age_at_death > 365 & age_at_death ~= .;
	tab age_at_death, missing;

	count;

	save `fileloc'/data/birth_data/birth_data_`year'.dta, replace;

};


** STACK THE YEARLY FILES INTO ONE DATASET: load 2002, then append 2003 through 2006;
local birth_dir "`fileloc'/data/birth_data";

use `birth_dir'/birth_data_2002.dta, clear;

forvalues year = 2003/2006 {;
	append using `birth_dir'/birth_data_`year'.dta;
};

** Keep only California mothers (mother_state code 5);
** This comment line has to end with the statement delimiter. When it ended with a colon the keep below was read as part of the comment and never ran;
keep if mother_state == 5;

** Drop mothers with missing data. A zip code of 0 is treated as missing too;
drop if mother_zip == 0;

** Variables that must be non-missing for a record to stay in the sample;
local required mother_zip birth_year birth_date gest_length prenatal_visits prenatal_start
	male twins trip_or_more HS_dropout HS_grad college_grad fetal_death
	white black hisp asian other_race medicaid care_first_tri low_weight birth_wght
	teen_mom age19_25 age26_30 age31_35 age36up premature
	first_born second_born third_born fourth_or_more;

foreach var of local required {;
	drop if `var' == .;
};

** Death fields may be missing for surviving children, so they are kept without being required;
keep `required' death_date age_at_death death_code;

** NOTE: death codes change from 1999 on. Prior to that, they are numerical, and the number used is the specific cause of death category (9th edition ICD). From 1999 onward, the group number cause of death is used, which is less specific but still works for our purposes;

** Current version uses only later data, but code remains for consistency with NBER version;
gen death_number = . ;
** Non-numeric death codes become missing in death_code_nostring because of the force option;
destring death_code, gen(death_code_nostring) force;

** Conditions shared by every rule within a coding era;
local died_pre99 "death_date ~= . & birth_year <= 1997";
local died_post99 "death_date ~= . & birth_year >= 1999";

** Did not die = 0;
replace death_number = 0 if death_date == . & fetal_death == 0;

***** Codes prior to 1999;
** Motor vehicle traffic accidents = 1;
replace death_number = 1 if `died_pre99' & inrange(death_code_nostring, 810, 825);
** External but not MVA, suicides and homicides = 2. Each quoted pair is an inclusive code range;
foreach span in "800, 807" "826, 838" "840, 848" "850, 858" "860, 869" "870, 876" "878, 879"
	"880, 888" "890, 899" "900, 929" "930, 949" "950, 959" "960, 969" {;
	replace death_number = 2 if `died_pre99' & inrange(death_code_nostring, `span');
};
** Specific cardio, circulatory, or respiratory death = 3;
replace death_number = 3 if `died_pre99' & (inrange(death_code_nostring, 390, 459) | inrange(death_code_nostring, 460, 519));

**** Codes 1999 and beyond;
** Motor vehicle traffic accidents = 1;
replace death_number = 1 if `died_post99' & inrange(death_code_nostring, 296, 306);
** External but not MVA, suicides and homicides = 2;
** The upper bound excludes missing codes. Stata orders missing above every number, so without it a death with an unreadable code would be classed as external instead of falling through to 4;
replace death_number = 2 if `died_post99' & death_code_nostring >= 307 & death_code_nostring < .;
** Specific cardio, circulatory, or respiratory death = 3;
replace death_number = 3 if `died_post99' & (inrange(death_code_nostring, 156, 191) | inrange(death_code_nostring, 192, 214));

** Other kind of death = 4, which covers every death not classified above;
replace death_number = 4 if death_date ~= . & death_number == .;

** Generate variable that indicates general timeframe of death (early or late in cycle);
** For records with a death date: 1 when age_at_death is 0, 2 when it is at most 6, 3 when it is at most 30, and 4 otherwise;
gen death_timeframe = cond(age_at_death == 0, 1, cond(age_at_death <= 6, 2, cond(age_at_death <= 30, 3, 4))) if death_date ~= .;
** 0 marks records with no death date that are not fetal deaths. Fetal deaths without a death date stay missing;
replace death_timeframe = 0 if death_date == . & fetal_death == 0;

** birth_id is stored as long. The default float type cannot represent every integer above 16,777,216, so row numbers past that point would no longer be unique;
gen long birth_id = _n /* will be used to identify individual births */;

sort mother_zip;

** Stata weekly dates for birth and death. death_week is missing when there is no death date;
gen birth_week = wofd(birth_date);
gen death_week = wofd(death_date);

** lifespan is the difference between the weekly dates, set to 53 when no death is recorded and to missing for fetal deaths;
gen lifespan = cond(death_week == ., 53, death_week - birth_week);
replace lifespan = . if fetal_death == 1;

compress;

save `fileloc'/data/birth_data/daily_births_and_deaths.dta, replace;

log close templog;

